SOFTWARE PARA EL ANÁLISIS DE DATOS (SAD)

MÁSTER UNIVERSITARIO EN BIOINFORMÁTICA Y BIOESTADÍSTICA

Preliminares

library(tidyverse)
## -- Attaching packages ---------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load both datasets. TODO: add an explanation of what they contain.
BCG_strain <- read_csv(file = "task_2-BCG_strain_per_country-1Nov2020.csv")
## Warning: Missing column names filled in: 'X20' [20], 'X21' [21], 'X22' [22],
## 'X23' [23], 'X24' [24], 'X25' [25], 'X26' [26]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X20 = col_logical(),
##   X21 = col_logical(),
##   X22 = col_logical(),
##   X23 = col_logical(),
##   X24 = col_logical(),
##   X25 = col_logical(),
##   X26 = col_logical()
## )
## See spec(...) for full column specifications.
# Drop the columns that contain nothing but NA (presumably the unnamed
# trailing columns that read_csv() filled in as X20..X26).
# A column is kept as soon as it holds at least one non-missing value, so
# columns with only a few missing cells are retained instead of being
# discarded together with their data.
BCG_strain <- BCG_strain[, colSums(!is.na(BCG_strain)) > 0]
# For now we are not interested in which vaccine strain was given each year,
# only in whether a vaccine was given at all. Each yearly value is recoded as:
#   0  - no vaccine was given (the text "None" is replaced by "0")
#   1  - a vaccine was given (any other non-missing value)
#   NA - unknown (cells matching "Unknown", or cells that were already NA)

BCG_strain_no_strain <- BCG_strain

# Recode every column except the first one (presumably the country name --
# TODO confirm). lapply() always returns one integer vector per column, so the
# assignment does not depend on sapply()'s shape-dependent simplification.
BCG_strain_no_strain[-1] <- lapply(BCG_strain_no_strain[-1], function(x) {
  # "None" -> "0"; gsub() with an NA replacement turns every matching
  # element into NA, which is how "Unknown" becomes missing.
  a <- gsub("None", "0", x)
  a <- gsub("Unknown", NA_character_, a)
  # Everything that is neither "0" nor NA counts as "vaccine given".
  # The vectorised mask is also safe for zero-length columns.
  a[!is.na(a) & a != "0"] <- "1"
  as.integer(a)
})

# Load the COVID-19 dataset (judging by the file name: death cases per
# country after the fifth death, up to 22 September 2020).
COVID_noformat <- read_csv(
  file = "task_2-COVID-19-death_cases_per_country_after_fifth_death-till_22_September_2020.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
# Drop the columns that contain nothing but NA.
# A column is kept as soon as it holds at least one non-missing value, so
# columns with only a few missing cells are retained instead of being
# discarded together with their data.
COVID_noNA <- COVID_noformat[, colSums(!is.na(COVID_noformat)) > 0]

# Cells matching the text "NULL" become real missing values (gsub() with an
# NA replacement sets every matching element to NA).
COVID_Na <- sapply(COVID_noNA, function(column) gsub("NULL", NA, column))

# Back to a data frame so that each column can get its own type.
COVID_Na_df <- as.data.frame(COVID_Na)

# Parse the two date columns; the text is read as day/month/two-digit year.
COVID_Na_df[, "date_fifth_death"] <- as.Date(
  COVID_Na_df[, "date_fifth_death"],
  format = "%d/%m/%y"
)
COVID_Na_df[, "date_first_death"] <- as.Date(
  COVID_Na_df[, "date_first_death"],
  format = "%d/%m/%y"
)

# Every column after the first four is converted to numeric.
COVID_Na_df[, -(1:4)] <- sapply(COVID_Na_df[, -(1:4)], as.numeric)

# Attach the COVID-19 columns to the recoded BCG table, matching rows on the
# country name; every row of the BCG table is kept (left join).
COVID_BGC <- BCG_strain_no_strain %>%
  left_join(COVID_Na_df, by = "country_name")


# Shorten the column names, they are very long.
# local() keeps the temporary vector out of the global environment.
colnames(COVID_BGC) <- local({
  short <- colnames(COVID_BGC)
  short <- gsub("mandatory_bcg_strain", "strain", short)
  short <- gsub("deaths_per_million", "dpm", short)
  short <- gsub("days_after_fifth_death", "d", short)
  short <- gsub("stringency_index", "si", short)
  short
})
# Correlation matrix of all remaining columns: the identifier and date
# columns are removed first, then every row containing an NA is dropped
# before calling cor() with its default settings.
cormat <- COVID_BGC %>%
  select(
    -c("country_name", "alpha_3_code", "date_first_death", "date_fifth_death")
  ) %>%
  na.omit() %>%
  cor()

# Long-format copy of the correlation matrix for ggplot: round to two
# decimals, blank out the upper triangle so that each correlation is shown
# only once, then melt into Var1 / Var2 / value columns.
cormat2 <- round(cormat, 2)
cormat2[upper.tri(cormat2)] <- NA
cormat2 <- melt(cormat2)

# Static heatmap with the viridis colour scale.
ggplot(cormat2, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_continuous(type = "viridis")

# Interactive plotly heatmap of the full (unmasked) correlation matrix, with
# the matrix column names used as labels on both axes.
fig <- plot_ly(
  z = cormat,
  x = colnames(cormat),
  y = colnames(cormat),
  type = "heatmap"
)

# Evaluating the object at top level displays the figure.
fig